# Print every column of `listings` with its type and first few values
listings %>%
glimpse()Rows: 6,296
Columns: 74
$ id <dbl> 2797791, 4990531, 6619374ā¦
$ listing_url <chr> "https://www.airbnb.com/rā¦
$ scrape_id <dbl> 2.021093e+13, 2.021093e+1ā¦
$ last_scraped <date> 2021-09-29, 2021-09-28, ā¦
$ name <chr> "Beijing Great Wall Escapā¦
$ description <chr> "A perfect escape only 2 ā¦
$ neighborhood_overview <chr> "Located in a small villaā¦
$ picture_url <chr> "https://a0.muscache.com/ā¦
$ host_id <dbl> 14311129, 25729513, 34492ā¦
$ host_url <chr> "https://www.airbnb.com/uā¦
$ host_name <chr> "Andrew", "Joel", "ä¹ę",ā¦
$ host_since <date> 2014-04-15, 2015-01-07, ā¦
$ host_location <chr> "Beijing, Beijing, China"ā¦
$ host_about <chr> "Been living in Beijing fā¦
$ host_response_time <chr> "within a few hours", "wiā¦
$ host_response_rate <chr> "100%", "100%", "N/A", "1ā¦
$ host_acceptance_rate <chr> "73%", "99%", "N/A", "100ā¦
$ host_is_superhost <lgl> FALSE, FALSE, FALSE, FALSā¦
$ host_thumbnail_url <chr> "https://a0.muscache.com/ā¦
$ host_picture_url <chr> "https://a0.muscache.com/ā¦
$ host_neighbourhood <chr> NA, "Shichahai", NA, NA, ā¦
$ host_listings_count <dbl> 1, 10, 2, 1, 3, 1, 5, 5, ā¦
$ host_total_listings_count <dbl> 1, 10, 2, 1, 3, 1, 5, 5, ā¦
$ host_verifications <chr> "['email', 'phone', 'reviā¦
$ host_has_profile_pic <lgl> TRUE, TRUE, TRUE, TRUE, Tā¦
$ host_identity_verified <lgl> TRUE, TRUE, TRUE, TRUE, Tā¦
$ neighbourhood <chr> "Beijing, China", "Beijinā¦
$ neighbourhood_cleansed <chr> "ęęåŗ / Huairou", "äøåā¦
$ neighbourhood_group_cleansed <lgl> NA, NA, NA, NA, NA, NA, Nā¦
$ latitude <dbl> 40.47329, 39.94193, 40.44ā¦
$ longitude <dbl> 116.5451, 116.3984, 116.0ā¦
$ property_type <chr> "Entire residential home"ā¦
$ room_type <chr> "Entire home/apt", "Entirā¦
$ accommodates <dbl> 10, 4, 15, 16, 12, 16, 12ā¦
$ bathrooms <lgl> NA, NA, NA, NA, NA, NA, Nā¦
$ bathrooms_text <chr> "1 bath", "1 bath", "4 baā¦
$ bedrooms <dbl> 3, 1, 4, 1, 4, 5, 2, 4, 4ā¦
$ beds <dbl> 3, 2, 4, 2, 5, 9, 11, 12,ā¦
$ amenities <chr> "[\"Dishes and silverwareā¦
$ price <chr> "$1,914.00", "$1,610.00",ā¦
$ minimum_nights <dbl> 1, 29, 1, 1, 1, 1, 1, 1, ā¦
$ maximum_nights <dbl> 1125, 365, 1125, 1125, 11ā¦
$ minimum_minimum_nights <dbl> 1, 29, 1, 1, 1, 1, 1, 1, ā¦
$ maximum_minimum_nights <dbl> 1, 29, 1, 1, 1, 1, 1, 1, ā¦
$ minimum_maximum_nights <dbl> 1125, 1125, 1125, 1125, 1ā¦
$ maximum_maximum_nights <dbl> 1125, 1125, 1125, 1125, 1ā¦
$ minimum_nights_avg_ntm <dbl> 1, 29, 1, 1, 1, 1, 1, 1, ā¦
$ maximum_nights_avg_ntm <dbl> 1125, 1125, 1125, 1125, 1ā¦
$ calendar_updated <lgl> NA, NA, NA, NA, NA, NA, Nā¦
$ has_availability <lgl> TRUE, TRUE, TRUE, TRUE, Tā¦
$ availability_30 <dbl> 20, 0, 25, 30, 24, 26, 24ā¦
$ availability_60 <dbl> 50, 0, 29, 60, 27, 56, 54ā¦
$ availability_90 <dbl> 53, 0, 29, 90, 27, 86, 84ā¦
$ availability_365 <dbl> 234, 118, 29, 365, 298, 3ā¦
$ calendar_last_scraped <date> 2021-09-29, 2021-09-28, ā¦
$ number_of_reviews <dbl> 56, 20, 3, 1, 2, 14, 61, ā¦
$ number_of_reviews_ltm <dbl> 3, 0, 0, 0, 1, 3, 15, 5, ā¦
$ number_of_reviews_l30d <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0ā¦
$ first_review <date> 2015-05-04, 2016-12-31, ā¦
$ last_review <date> 2019-05-11, 2020-07-17, ā¦
$ review_scores_rating <dbl> 4.63, 4.68, 5.00, 0.00, 5ā¦
$ review_scores_accuracy <dbl> 4.72, 4.71, 5.00, NA, 5.0ā¦
$ review_scores_cleanliness <dbl> 4.24, 4.82, 4.67, NA, 5.0ā¦
$ review_scores_checkin <dbl> 4.89, 5.00, 5.00, NA, 5.0ā¦
$ review_scores_communication <dbl> 4.92, 4.88, 5.00, NA, 5.0ā¦
$ review_scores_location <dbl> 4.91, 4.88, 5.00, NA, 5.0ā¦
$ review_scores_value <dbl> 4.30, 4.76, 4.33, NA, 5.0ā¦
$ license <lgl> NA, NA, NA, NA, NA, NA, Nā¦
$ instant_bookable <lgl> FALSE, TRUE, FALSE, TRUE,ā¦
$ calculated_host_listings_count <dbl> 1, 10, 1, 1, 3, 1, 1, 2, ā¦
$ calculated_host_listings_count_entire_homes <dbl> 1, 6, 1, 0, 3, 1, 1, 2, 3ā¦
$ calculated_host_listings_count_private_rooms <dbl> 0, 4, 0, 1, 0, 0, 0, 0, 0ā¦
$ calculated_host_listings_count_shared_rooms <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0ā¦
$ reviews_per_month <dbl> 0.72, 0.35, 0.11, 0.02, 0ā¦
# Summarise every column of `listings` (missingness, uniqueness, quantiles), grouped by column type
listings %>%
skim()| Name | Piped data |
| Number of rows | 6296 |
| Number of columns | 74 |
| _______________________ | |
| Column type frequency: | |
| character | 23 |
| Date | 5 |
| logical | 9 |
| numeric | 37 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| listing_url | 0 | 1.00 | 36 | 37 | 0 | 6296 | 0 |
| name | 0 | 1.00 | 1 | 185 | 0 | 6117 | 0 |
| description | 265 | 0.96 | 2 | 1000 | 0 | 5166 | 0 |
| neighborhood_overview | 1139 | 0.82 | 2 | 1000 | 0 | 3790 | 0 |
| picture_url | 0 | 1.00 | 63 | 112 | 0 | 6032 | 0 |
| host_url | 0 | 1.00 | 41 | 43 | 0 | 2665 | 0 |
| host_name | 0 | 1.00 | 1 | 41 | 0 | 2299 | 0 |
| host_location | 4 | 1.00 | 2 | 40 | 0 | 41 | 0 |
| host_about | 3332 | 0.47 | 1 | 4820 | 0 | 1063 | 1 |
| host_response_time | 0 | 1.00 | 3 | 18 | 0 | 5 | 0 |
| host_response_rate | 0 | 1.00 | 2 | 4 | 0 | 37 | 0 |
| host_acceptance_rate | 0 | 1.00 | 2 | 4 | 0 | 46 | 0 |
| host_thumbnail_url | 0 | 1.00 | 55 | 106 | 0 | 2658 | 0 |
| host_picture_url | 0 | 1.00 | 57 | 109 | 0 | 2658 | 0 |
| host_neighbourhood | 5938 | 0.06 | 2 | 25 | 0 | 28 | 0 |
| host_verifications | 0 | 1.00 | 2 | 151 | 0 | 99 | 0 |
| neighbourhood | 1139 | 0.82 | 14 | 34 | 0 | 8 | 0 |
| neighbourhood_cleansed | 0 | 1.00 | 3 | 16 | 0 | 16 | 0 |
| property_type | 0 | 1.00 | 3 | 35 | 0 | 80 | 0 |
| room_type | 0 | 1.00 | 11 | 15 | 0 | 3 | 0 |
| bathrooms_text | 5 | 1.00 | 6 | 17 | 0 | 70 | 0 |
| amenities | 0 | 1.00 | 27 | 1206 | 0 | 5199 | 0 |
| price | 0 | 1.00 | 6 | 10 | 0 | 3029 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| last_scraped | 0 | 1.00 | 2021-09-28 | 2021-09-29 | 2021-09-28 | 2 |
| host_since | 0 | 1.00 | 2013-02-06 | 2021-09-19 | 2019-05-21 | 1370 |
| calendar_last_scraped | 0 | 1.00 | 2021-09-28 | 2021-09-29 | 2021-09-28 | 2 |
| first_review | 3203 | 0.49 | 2015-05-04 | 2021-09-28 | 2020-07-18 | 843 |
| last_review | 3203 | 0.49 | 2016-04-04 | 2021-09-28 | 2021-04-30 | 730 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| host_is_superhost | 0 | 1 | 0.24 | FAL: 4811, TRU: 1485 |
| host_has_profile_pic | 0 | 1 | 1.00 | TRU: 6288, FAL: 8 |
| host_identity_verified | 0 | 1 | 1.00 | TRU: 6279, FAL: 17 |
| neighbourhood_group_cleansed | 6296 | 0 | NaN | : |
| bathrooms | 6296 | 0 | NaN | : |
| calendar_updated | 6296 | 0 | NaN | : |
| has_availability | 0 | 1 | 1.00 | TRU: 6296 |
| license | 6296 | 0 | NaN | : |
| instant_bookable | 0 | 1 | 0.65 | TRU: 4111, FAL: 2185 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 4.068584e+07 | 9018880.17 | 2.797791e+06 | 3.513261e+07 | 4.309471e+07 | 4.883673e+07 | 5.245929e+07 | āāāā ā |
| scrape_id | 0 | 1.00 | 2.021093e+13 | 0.00 | 2.021093e+13 | 2.021093e+13 | 2.021093e+13 | 2.021093e+13 | 2.021093e+13 | āāāāā |
| host_id | 0 | 1.00 | 2.545746e+08 | 106847611.68 | 4.984459e+06 | 1.828459e+08 | 2.631712e+08 | 3.492971e+08 | 4.236643e+08 | āā āāā |
| host_listings_count | 0 | 1.00 | 7.470000e+00 | 14.76 | 0.000000e+00 | 1.000000e+00 | 5.000000e+00 | 9.000000e+00 | 2.570000e+02 | āāāāā |
| host_total_listings_count | 0 | 1.00 | 7.470000e+00 | 14.76 | 0.000000e+00 | 1.000000e+00 | 5.000000e+00 | 9.000000e+00 | 2.570000e+02 | āāāāā |
| latitude | 0 | 1.00 | 4.031000e+01 | 0.30 | 3.947000e+01 | 4.019000e+01 | 4.041000e+01 | 4.050000e+01 | 4.095000e+01 | āāāāā |
| longitude | 0 | 1.00 | 1.164300e+02 | 0.47 | 1.154400e+02 | 1.160200e+02 | 1.164200e+02 | 1.167000e+02 | 1.175000e+02 | āāāāā |
| accommodates | 0 | 1.00 | 7.100000e+00 | 5.11 | 1.000000e+00 | 2.000000e+00 | 5.000000e+00 | 1.200000e+01 | 1.600000e+01 | āāāāā |
| bedrooms | 61 | 0.99 | 3.060000e+00 | 2.49 | 1.000000e+00 | 1.000000e+00 | 2.000000e+00 | 5.000000e+00 | 2.500000e+01 | āāāāā |
| beds | 19 | 1.00 | 4.310000e+00 | 4.33 | 0.000000e+00 | 1.000000e+00 | 3.000000e+00 | 6.000000e+00 | 7.100000e+01 | āāāāā |
| minimum_nights | 0 | 1.00 | 1.380000e+00 | 9.33 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 3.650000e+02 | āāāāā |
| maximum_nights | 0 | 1.00 | 8.738100e+02 | 418.94 | 1.000000e+00 | 3.650000e+02 | 1.125000e+03 | 1.125000e+03 | 1.125000e+03 | āāāāā |
| minimum_minimum_nights | 0 | 1.00 | 1.360000e+00 | 9.30 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 3.650000e+02 | āāāāā |
| maximum_minimum_nights | 0 | 1.00 | 1.540000e+00 | 15.67 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+03 | āāāāā |
| minimum_maximum_nights | 0 | 1.00 | 9.331500e+02 | 378.79 | 1.000000e+00 | 1.125000e+03 | 1.125000e+03 | 1.125000e+03 | 1.125000e+03 | āāāāā |
| maximum_maximum_nights | 0 | 1.00 | 9.353200e+02 | 377.05 | 1.000000e+00 | 1.125000e+03 | 1.125000e+03 | 1.125000e+03 | 1.125000e+03 | āāāāā |
| minimum_nights_avg_ntm | 0 | 1.00 | 1.410000e+00 | 9.98 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 3.650000e+02 | āāāāā |
| maximum_nights_avg_ntm | 0 | 1.00 | 9.349400e+02 | 376.98 | 1.000000e+00 | 1.125000e+03 | 1.125000e+03 | 1.125000e+03 | 1.125000e+03 | āāāāā |
| availability_30 | 0 | 1.00 | 1.917000e+01 | 9.82 | 0.000000e+00 | 1.600000e+01 | 2.400000e+01 | 2.500000e+01 | 3.000000e+01 | ā āāāā |
| availability_60 | 0 | 1.00 | 4.524000e+01 | 16.59 | 0.000000e+00 | 3.500000e+01 | 5.300000e+01 | 5.500000e+01 | 6.000000e+01 | āāāāā |
| availability_90 | 0 | 1.00 | 7.160000e+01 | 24.32 | 0.000000e+00 | 6.300000e+01 | 8.300000e+01 | 8.500000e+01 | 9.000000e+01 | āāāāā |
| availability_365 | 0 | 1.00 | 2.493600e+02 | 126.21 | 0.000000e+00 | 1.530000e+02 | 3.370000e+02 | 3.590000e+02 | 3.650000e+02 | āāāāā |
| number_of_reviews | 0 | 1.00 | 3.300000e+00 | 11.44 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 2.000000e+00 | 4.600000e+02 | āāāāā |
| number_of_reviews_ltm | 0 | 1.00 | 1.450000e+00 | 4.37 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 9.700000e+01 | āāāāā |
| number_of_reviews_l30d | 0 | 1.00 | 1.300000e-01 | 0.64 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.600000e+01 | āāāāā |
| review_scores_rating | 3203 | 0.49 | 4.670000e+00 | 1.00 | 0.000000e+00 | 4.840000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | āāāāā |
| review_scores_accuracy | 3312 | 0.47 | 4.900000e+00 | 0.37 | 1.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | āāāāā |
| review_scores_cleanliness | 3312 | 0.47 | 4.870000e+00 | 0.39 | 1.000000e+00 | 4.920000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | āāāāā |
| review_scores_checkin | 3312 | 0.47 | 4.910000e+00 | 0.36 | 1.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | āāāāā |
| review_scores_communication | 3312 | 0.47 | 4.920000e+00 | 0.35 | 1.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | āāāāā |
| review_scores_location | 3312 | 0.47 | 4.860000e+00 | 0.38 | 1.000000e+00 | 4.890000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | āāāāā |
| review_scores_value | 3312 | 0.47 | 4.800000e+00 | 0.48 | 1.000000e+00 | 4.800000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | āāāāā |
| calculated_host_listings_count | 0 | 1.00 | 5.910000e+00 | 6.12 | 1.000000e+00 | 1.000000e+00 | 4.000000e+00 | 8.000000e+00 | 3.300000e+01 | āāāāā |
| calculated_host_listings_count_entire_homes | 0 | 1.00 | 2.350000e+00 | 3.83 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 2.000000e+00 | 3.100000e+01 | āāāāā |
| calculated_host_listings_count_private_rooms | 0 | 1.00 | 3.490000e+00 | 5.33 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 5.000000e+00 | 3.300000e+01 | āāāāā |
| calculated_host_listings_count_shared_rooms | 0 | 1.00 | 7.000000e-02 | 0.64 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 9.000000e+00 | āāāāā |
| reviews_per_month | 3203 | 0.49 | 5.200000e-01 | 0.84 | 2.000000e-02 | 1.100000e-01 | 2.600000e-01 | 6.100000e-01 | 1.600000e+01 | āāāāā |
Beijing has 6296 accommodations listed on Airbnb.
Since price is a quantitative variable, we need to make sure it is stored as numeric data (not as a character string) in the data frame.
# price is read in as a character string such as "$1,914.00"; parse_number()
# converts it to a numeric value
listings <- listings %>%
mutate(price = parse_number(price))typeof(listings$price)[1] "double"
# Re-run the summary: price should now be listed among the numeric columns
skim(listings)| Name | listings |
| Number of rows | 6296 |
| Number of columns | 74 |
| _______________________ | |
| Column type frequency: | |
| character | 22 |
| Date | 5 |
| logical | 9 |
| numeric | 38 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| listing_url | 0 | 1.00 | 36 | 37 | 0 | 6296 | 0 |
| name | 0 | 1.00 | 1 | 185 | 0 | 6117 | 0 |
| description | 265 | 0.96 | 2 | 1000 | 0 | 5166 | 0 |
| neighborhood_overview | 1139 | 0.82 | 2 | 1000 | 0 | 3790 | 0 |
| picture_url | 0 | 1.00 | 63 | 112 | 0 | 6032 | 0 |
| host_url | 0 | 1.00 | 41 | 43 | 0 | 2665 | 0 |
| host_name | 0 | 1.00 | 1 | 41 | 0 | 2299 | 0 |
| host_location | 4 | 1.00 | 2 | 40 | 0 | 41 | 0 |
| host_about | 3332 | 0.47 | 1 | 4820 | 0 | 1063 | 1 |
| host_response_time | 0 | 1.00 | 3 | 18 | 0 | 5 | 0 |
| host_response_rate | 0 | 1.00 | 2 | 4 | 0 | 37 | 0 |
| host_acceptance_rate | 0 | 1.00 | 2 | 4 | 0 | 46 | 0 |
| host_thumbnail_url | 0 | 1.00 | 55 | 106 | 0 | 2658 | 0 |
| host_picture_url | 0 | 1.00 | 57 | 109 | 0 | 2658 | 0 |
| host_neighbourhood | 5938 | 0.06 | 2 | 25 | 0 | 28 | 0 |
| host_verifications | 0 | 1.00 | 2 | 151 | 0 | 99 | 0 |
| neighbourhood | 1139 | 0.82 | 14 | 34 | 0 | 8 | 0 |
| neighbourhood_cleansed | 0 | 1.00 | 3 | 16 | 0 | 16 | 0 |
| property_type | 0 | 1.00 | 3 | 35 | 0 | 80 | 0 |
| room_type | 0 | 1.00 | 11 | 15 | 0 | 3 | 0 |
| bathrooms_text | 5 | 1.00 | 6 | 17 | 0 | 70 | 0 |
| amenities | 0 | 1.00 | 27 | 1206 | 0 | 5199 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| last_scraped | 0 | 1.00 | 2021-09-28 | 2021-09-29 | 2021-09-28 | 2 |
| host_since | 0 | 1.00 | 2013-02-06 | 2021-09-19 | 2019-05-21 | 1370 |
| calendar_last_scraped | 0 | 1.00 | 2021-09-28 | 2021-09-29 | 2021-09-28 | 2 |
| first_review | 3203 | 0.49 | 2015-05-04 | 2021-09-28 | 2020-07-18 | 843 |
| last_review | 3203 | 0.49 | 2016-04-04 | 2021-09-28 | 2021-04-30 | 730 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| host_is_superhost | 0 | 1 | 0.24 | FAL: 4811, TRU: 1485 |
| host_has_profile_pic | 0 | 1 | 1.00 | TRU: 6288, FAL: 8 |
| host_identity_verified | 0 | 1 | 1.00 | TRU: 6279, FAL: 17 |
| neighbourhood_group_cleansed | 6296 | 0 | NaN | : |
| bathrooms | 6296 | 0 | NaN | : |
| calendar_updated | 6296 | 0 | NaN | : |
| has_availability | 0 | 1 | 1.00 | TRU: 6296 |
| license | 6296 | 0 | NaN | : |
| instant_bookable | 0 | 1 | 0.65 | TRU: 4111, FAL: 2185 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 4.068584e+07 | 9018880.17 | 2.797791e+06 | 3.513261e+07 | 4.309471e+07 | 4.883673e+07 | 5.245929e+07 | āāāā ā |
| scrape_id | 0 | 1.00 | 2.021093e+13 | 0.00 | 2.021093e+13 | 2.021093e+13 | 2.021093e+13 | 2.021093e+13 | 2.021093e+13 | āāāāā |
| host_id | 0 | 1.00 | 2.545746e+08 | 106847611.68 | 4.984459e+06 | 1.828459e+08 | 2.631712e+08 | 3.492971e+08 | 4.236643e+08 | āā āāā |
| host_listings_count | 0 | 1.00 | 7.470000e+00 | 14.76 | 0.000000e+00 | 1.000000e+00 | 5.000000e+00 | 9.000000e+00 | 2.570000e+02 | āāāāā |
| host_total_listings_count | 0 | 1.00 | 7.470000e+00 | 14.76 | 0.000000e+00 | 1.000000e+00 | 5.000000e+00 | 9.000000e+00 | 2.570000e+02 | āāāāā |
| latitude | 0 | 1.00 | 4.031000e+01 | 0.30 | 3.947000e+01 | 4.019000e+01 | 4.041000e+01 | 4.050000e+01 | 4.095000e+01 | āāāāā |
| longitude | 0 | 1.00 | 1.164300e+02 | 0.47 | 1.154400e+02 | 1.160200e+02 | 1.164200e+02 | 1.167000e+02 | 1.175000e+02 | āāāāā |
| accommodates | 0 | 1.00 | 7.100000e+00 | 5.11 | 1.000000e+00 | 2.000000e+00 | 5.000000e+00 | 1.200000e+01 | 1.600000e+01 | āāāāā |
| bedrooms | 61 | 0.99 | 3.060000e+00 | 2.49 | 1.000000e+00 | 1.000000e+00 | 2.000000e+00 | 5.000000e+00 | 2.500000e+01 | āāāāā |
| beds | 19 | 1.00 | 4.310000e+00 | 4.33 | 0.000000e+00 | 1.000000e+00 | 3.000000e+00 | 6.000000e+00 | 7.100000e+01 | āāāāā |
| price | 0 | 1.00 | 2.417430e+03 | 2676.84 | 5.900000e+01 | 6.190000e+02 | 1.555000e+03 | 3.511500e+03 | 6.399500e+04 | āāāāā |
| minimum_nights | 0 | 1.00 | 1.380000e+00 | 9.33 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 3.650000e+02 | āāāāā |
| maximum_nights | 0 | 1.00 | 8.738100e+02 | 418.94 | 1.000000e+00 | 3.650000e+02 | 1.125000e+03 | 1.125000e+03 | 1.125000e+03 | āāāāā |
| minimum_minimum_nights | 0 | 1.00 | 1.360000e+00 | 9.30 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 3.650000e+02 | āāāāā |
| maximum_minimum_nights | 0 | 1.00 | 1.540000e+00 | 15.67 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+03 | āāāāā |
| minimum_maximum_nights | 0 | 1.00 | 9.331500e+02 | 378.79 | 1.000000e+00 | 1.125000e+03 | 1.125000e+03 | 1.125000e+03 | 1.125000e+03 | āāāāā |
| maximum_maximum_nights | 0 | 1.00 | 9.353200e+02 | 377.05 | 1.000000e+00 | 1.125000e+03 | 1.125000e+03 | 1.125000e+03 | 1.125000e+03 | āāāāā |
| minimum_nights_avg_ntm | 0 | 1.00 | 1.410000e+00 | 9.98 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 3.650000e+02 | āāāāā |
| maximum_nights_avg_ntm | 0 | 1.00 | 9.349400e+02 | 376.98 | 1.000000e+00 | 1.125000e+03 | 1.125000e+03 | 1.125000e+03 | 1.125000e+03 | āāāāā |
| availability_30 | 0 | 1.00 | 1.917000e+01 | 9.82 | 0.000000e+00 | 1.600000e+01 | 2.400000e+01 | 2.500000e+01 | 3.000000e+01 | ā āāāā |
| availability_60 | 0 | 1.00 | 4.524000e+01 | 16.59 | 0.000000e+00 | 3.500000e+01 | 5.300000e+01 | 5.500000e+01 | 6.000000e+01 | āāāāā |
| availability_90 | 0 | 1.00 | 7.160000e+01 | 24.32 | 0.000000e+00 | 6.300000e+01 | 8.300000e+01 | 8.500000e+01 | 9.000000e+01 | āāāāā |
| availability_365 | 0 | 1.00 | 2.493600e+02 | 126.21 | 0.000000e+00 | 1.530000e+02 | 3.370000e+02 | 3.590000e+02 | 3.650000e+02 | āāāāā |
| number_of_reviews | 0 | 1.00 | 3.300000e+00 | 11.44 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 2.000000e+00 | 4.600000e+02 | āāāāā |
| number_of_reviews_ltm | 0 | 1.00 | 1.450000e+00 | 4.37 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 9.700000e+01 | āāāāā |
| number_of_reviews_l30d | 0 | 1.00 | 1.300000e-01 | 0.64 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.600000e+01 | āāāāā |
| review_scores_rating | 3203 | 0.49 | 4.670000e+00 | 1.00 | 0.000000e+00 | 4.840000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | āāāāā |
| review_scores_accuracy | 3312 | 0.47 | 4.900000e+00 | 0.37 | 1.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | āāāāā |
| review_scores_cleanliness | 3312 | 0.47 | 4.870000e+00 | 0.39 | 1.000000e+00 | 4.920000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | āāāāā |
| review_scores_checkin | 3312 | 0.47 | 4.910000e+00 | 0.36 | 1.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | āāāāā |
| review_scores_communication | 3312 | 0.47 | 4.920000e+00 | 0.35 | 1.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | āāāāā |
| review_scores_location | 3312 | 0.47 | 4.860000e+00 | 0.38 | 1.000000e+00 | 4.890000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | āāāāā |
| review_scores_value | 3312 | 0.47 | 4.800000e+00 | 0.48 | 1.000000e+00 | 4.800000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | āāāāā |
| calculated_host_listings_count | 0 | 1.00 | 5.910000e+00 | 6.12 | 1.000000e+00 | 1.000000e+00 | 4.000000e+00 | 8.000000e+00 | 3.300000e+01 | āāāāā |
| calculated_host_listings_count_entire_homes | 0 | 1.00 | 2.350000e+00 | 3.83 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 2.000000e+00 | 3.100000e+01 | āāāāā |
| calculated_host_listings_count_private_rooms | 0 | 1.00 | 3.490000e+00 | 5.33 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 5.000000e+00 | 3.300000e+01 | āāāāā |
| calculated_host_listings_count_shared_rooms | 0 | 1.00 | 7.000000e-02 | 0.64 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 9.000000e+00 | āāāāā |
| reviews_per_month | 3203 | 0.49 | 5.200000e-01 | 0.84 | 2.000000e-02 | 1.100000e-01 | 2.600000e-01 | 6.100000e-01 | 1.600000e+01 | āāāāā |
# Price (per bedroom) distribution by room type.
# Listings with a missing bedroom count get an NA price_per_bedroom and are
# dropped from the histogram by ggplot2 (with a warning).
listings %>%
  filter(!is.na(room_type)) %>%
  mutate(price_per_bedroom = price / bedrooms) %>%
  ggplot(aes(x = price_per_bedroom, colour = room_type)) +
  # alpha is a fixed setting, not a data mapping, so it is set outside aes();
  # inside aes() it would be treated as a variable and add a spurious legend
  geom_histogram(alpha = 0.4) +
  facet_wrap(~room_type) +
  theme_bw() +
  labs(title = "Price Distribution by Room Type")

Most of the accommodations available are Entire home/apartment. One very simple reason for that is that sharing rooms in Beijing is quite uncommon, although in this graph we don't see a huge disparity in the price between these different types of accommodation since all distributions are skewed to the right.
# Box plot of price per bedroom by neighbourhoods
listings %>%
  filter(!is.na(neighbourhood_cleansed)) %>%
  mutate(price_per_bedroom = price / bedrooms) %>%
  ggplot(aes(x = factor(neighbourhood_cleansed))) +
  geom_boxplot(aes(y = price_per_bedroom)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # Zoom with coord_cartesian() rather than scale limits: scale limits discard
  # values above 2500 *before* the box statistics are computed, which distorts
  # the medians and quartiles; coord_cartesian() only crops the view
  coord_cartesian(ylim = c(0, 2500)) +
  labs(title = "Box Plot of Price per Bedroom by Neighbourhoods")

# Number of listings in each neighbourhood
listings_1 <- listings %>%
  count(neighbourhood_cleansed)

# Bar chart of the top 25 neighbourhoods by listing count, largest at the top
listings_1 %>%
  slice_max(order_by = n, n = 25) %>%
  ggplot(aes(x = n, y = fct_reorder(neighbourhood_cleansed, n))) +
  geom_col() +
  labs(
    title = "Neighbourhood ranked by number of listings",
    x = "Number of Listings",
    y = "Neighbourhood"
  )

From the mapping, we have figured out Airbnb listings in Beijing are concentrated in city centre and the northern region around HuaiRou District. The popularity is explained by the convenience to commute in the centre of Beijing and the costly expense for an alternative means of lodging such as hotel. Meanwhile, there are so many scenic spots on the northern side far from city centres (around 2 hours of commuting) and the properties are more appealing with nice views. So tourists prefer to stay at Airbnb if they want to make a visit to these spots.
# Correlation matrix of key variables
# Store the natural log of price as a new column so it can be plotted by name
listings <- listings %>%
  mutate(log_price = log(price))

# Pairwise scatter plots, densities and correlations for the selected columns
ggpairs(
  listings,
  columns = c(
    "log_price", "accommodates", "bedrooms", "availability_30",
    "availability_60", "review_scores_rating", "beds", "number_of_reviews",
    "minimum_nights"
  )
)

The highest correlations seem to be between beds, bedrooms and accommodates which is obvious. Another interesting but logical correlation is between price and accommodates: a price for an apartment that can accommodate 8 people is logically more expensive than one that accommodates only 2 people.
# Box plot of price per bedroom by whether the host is super host
listings %>%
  filter(!is.na(host_is_superhost)) %>%
  mutate(price_per_bedroom = price / bedrooms) %>%
  ggplot(aes(x = factor(host_is_superhost))) +
  geom_boxplot(aes(y = price_per_bedroom)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # Crop the view with coord_cartesian() instead of scale limits, so listings
  # above 2500 still contribute to the medians and quartiles being compared
  coord_cartesian(ylim = c(0, 2500)) +
  labs(title = "Box Plot of Price per Bedroom by the host")

The second one is box plots describing the relationship between prices and whether hosts are super hosts. We can see a small difference, but surprisingly not that big, meaning that being a superhost doesn't increase demand for their accommodation that much and doesn't seem to impact the prices. But we'll try to confirm or deny that later on in the regression analysis.
# Keep the four most frequent property types as their own categories and pool
# every other property type into "Other"
listings <- listings %>%
  mutate(
    prop_type_simplified = if_else(
      property_type %in% c(
        "Entire villa", "Entire residential home",
        "Farm stay", "Private room in farm stay"
      ),
      property_type,
      "Other"
    )
  )

The most common property types are Entire villa, Entire residential home, Farm stay, and Private room in farm stay, which is coherent with the first graph we plotted.
# Cross-tabulate the original and simplified property types, most frequent
# first, to check that the recoding was applied as intended
listings %>%
count(property_type, prop_type_simplified) %>%
arrange(desc(n)) | property_type | prop_type_simplified | n |
|---|---|---|
| Entire villa | Entire villa | 812 |
| Entire residential home | Entire residential home | 620 |
| Farm stay | Farm stay | 618 |
| Private room in farm stay | Private room in farm stay | 556 |
| Entire cottage | Other | 516 |
| Private room in kezhan | Other | 375 |
| Private room in villa | Other | 290 |
| Room in boutique hotel | Other | 279 |
| Private room in residential home | Other | 273 |
| Room in hotel | Other | 233 |
| Entire bungalow | Other | 217 |
| Private room in cottage | Other | 207 |
| Entire rental unit | Other | 156 |
| Entire townhouse | Other | 135 |
| Private room in bed and breakfast | Other | 118 |
| Private room in serviced apartment | Other | 84 |
| Private room in resort | Other | 80 |
| Private room in bungalow | Other | 68 |
| Entire loft | Other | 61 |
| Kezhan | Other | 60 |
| Private room in nature lodge | Other | 53 |
| Private room in townhouse | Other | 46 |
| Entire cabin | Other | 40 |
| Private room in rental unit | Other | 40 |
| Shared room in hostel | Other | 32 |
| Entire condominium (condo) | Other | 28 |
| Entire serviced apartment | Other | 28 |
| Private room in hostel | Other | 26 |
| Private room | Other | 23 |
| Earth house | Other | 22 |
| Room in aparthotel | Other | 17 |
| Private room in loft | Other | 15 |
| Private room in guesthouse | Other | 14 |
| Entire place | Other | 11 |
| Entire chalet | Other | 10 |
| Campsite | Other | 9 |
| Private room in earth house | Other | 7 |
| Entire bed and breakfast | Other | 6 |
| Entire home/apt | Other | 6 |
| Private room in cabin | Other | 6 |
| Private room in cave | Other | 5 |
| Private room in guest suite | Other | 5 |
| Private room in minsu | Other | 5 |
| Barn | Other | 4 |
| Entire guest suite | Other | 4 |
| Private room in barn | Other | 4 |
| Private room in castle | Other | 4 |
| Ranch | Other | 4 |
| Shared room in bed and breakfast | Other | 4 |
| Shared room in cottage | Other | 4 |
| Shared room in farm stay | Other | 4 |
| Shared room in kezhan | Other | 4 |
| Casa particular | Other | 3 |
| Minsu | Other | 3 |
| Private room in condominium (condo) | Other | 3 |
| Shared room in boutique hotel | Other | 3 |
| Tiny house | Other | 3 |
| Castle | Other | 2 |
| Entire guesthouse | Other | 2 |
| Entire resort | Other | 2 |
| Hut | Other | 2 |
| Private room in hut | Other | 2 |
| Private room in ranch | Other | 2 |
| Private room in tiny house | Other | 2 |
| Private room in treehouse | Other | 2 |
| Shared room in rental unit | Other | 2 |
| Shared room in villa | Other | 2 |
| Cave | Other | 1 |
| Entire hostel | Other | 1 |
| Holiday park | Other | 1 |
| Houseboat | Other | 1 |
| Pension | Other | 1 |
| Private room in camper/rv | Other | 1 |
| Private room in dome house | Other | 1 |
| Private room in ryokan | Other | 1 |
| Private room in shipping container | Other | 1 |
| Riad | Other | 1 |
| Shared room in earth house | Other | 1 |
| Shared room in townhouse | Other | 1 |
| Treehouse | Other | 1 |
Airbnb is most commonly used for travel purposes, i.e., as an alternative to traditional hotels. We only want to include listings in our regression analysis that are intended for travel purposes:
# Frequency table of the minimum_nights values across listings
listings %>%
count(minimum_nights)| minimum_nights | n |
|---|---|
| 1 | 6197 |
| 2 | 44 |
| 3 | 8 |
| 4 | 1 |
| 5 | 2 |
| 7 | 5 |
| 10 | 8 |
| 15 | 2 |
| 29 | 7 |
| 30 | 18 |
| 360 | 1 |
| 365 | 3 |
There are some unusual values for minimum_nights, such as 29, 30, 360 and 365 nights. Airbnb does this to encourage customers to stay longer and spend more money.
# Keep only listings whose minimum stay is at most 4 nights
listings <- listings %>%
  filter(minimum_nights <= 4)

# Interactive map of the remaining listings. `listings` was restricted to
# minimum_nights <= 4 just above, so it is passed to leaflet() directly
# instead of being filtered a second time.
leaflet(data = listings) %>%
  addProviderTiles("OpenStreetMap.Mapnik") %>%
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    radius = 1,
    fillColor = "blue",
    fillOpacity = 0.4,
    popup = ~listing_url,
    label = ~property_type
  )

For the target variable \(Y\), we will use the cost for two people to stay at an Airbnb location for four (4) nights.
Create a new variable called price_4_nights that uses price and accommodates to calculate the total cost for two people to stay at the Airbnb property for 4 nights. This is the variable \(Y\) we want to explain.
# Cost for two people staying four nights: take the nightly price per person
# (price / accommodates), then scale to 2 people and 4 nights
listings <- listings %>%
  mutate(
    price_4_nights = (price / accommodates) * 2 * 4
  )

Use histograms or density plots to examine the distributions of price_4_nights and log(price_4_nights). Which variable should you use for the regression model? Why?
listings %>%
ggplot()+
geom_density(aes(x=price_4_nights)) listings %>%
ggplot()+
geom_density(aes(x=log(price_4_nights)))#Density distribution of raw price_4_nights and log(price_4_nights)
Linear regression assumes that the model residuals are approximately normally distributed, which is easier to satisfy when the response itself is roughly symmetric. Since log(price_4_nights) fits the normal distribution much better than price_4_nights, it should be used as the response variable.
We fitted a regression model called model1 with the following explanatory variables: prop_type_simplified, number_of_reviews, and review_scores_rating.
# model1: log 4-night price explained by property type and the review metrics.
model1 <- lm(
  log(price_4_nights) ~ prop_type_simplified + number_of_reviews +
    review_scores_rating,
  data = listings
)
msummary(model1) Estimate Std. Error t value
(Intercept) 7.6918165 0.0640998 119.998
prop_type_simplifiedEntire villa 0.1473064 0.0463454 3.178
prop_type_simplifiedFarm stay -0.2983630 0.0501030 -5.955
prop_type_simplifiedOther -0.0419324 0.0374138 -1.121
prop_type_simplifiedPrivate room in farm stay -0.8578970 0.0572267 -14.991
number_of_reviews -0.0012475 0.0007375 -1.692
review_scores_rating 0.0125218 0.0115063 1.088
Pr(>|t|)
(Intercept) < 2e-16 ***
prop_type_simplifiedEntire villa 0.0015 **
prop_type_simplifiedFarm stay 2.89e-09 ***
prop_type_simplifiedOther 0.2625
prop_type_simplifiedPrivate room in farm stay < 2e-16 ***
number_of_reviews 0.0908 .
review_scores_rating 0.2766
Residual standard error: 0.6338 on 3074 degrees of freedom
(3169 observations deleted due to missingness)
Multiple R-squared: 0.1125, Adjusted R-squared: 0.1108
F-statistic: 64.93 on 6 and 3074 DF, p-value: < 2.2e-16
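"review_scores_rating" is not a significant predictor of log(price_4_nights): its p-value is 0.28, well above the 0.05 threshold. "number_of_reviews" (p = 0.09) is not significant at the 5% level either.
Since "prop_type_simplified" is a categorical variable, each coefficient is interpreted relative to the reference (baseline) property type. Holding the other variables constant, an entire villa costs about 16% more (exp(0.147) ≈ 1.16), a farm stay about 26% less (exp(-0.298) ≈ 0.74) and a private room in a farm stay about 58% less (exp(-0.858) ≈ 0.42), while the "Other" category is not significantly different from the baseline (p = 0.26).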
We want to determine if room_type is a significant predictor of the cost for 4 nights, given everything else in the model.
# model2: model1's predictors plus room_type.
model2 <- lm(
  log(price_4_nights) ~ prop_type_simplified + number_of_reviews +
    review_scores_rating + room_type,
  data = listings
)
msummary(model2) Estimate Std. Error t value
(Intercept) 7.7010521 0.0635534 121.175
prop_type_simplifiedEntire villa 0.1471316 0.0459346 3.203
prop_type_simplifiedFarm stay -0.2981248 0.0496591 -6.003
prop_type_simplifiedOther 0.0723514 0.0404026 1.791
prop_type_simplifiedPrivate room in farm stay -0.6563589 0.0640321 -10.250
number_of_reviews -0.0010252 0.0007354 -1.394
review_scores_rating 0.0102848 0.0114091 0.901
room_typePrivate room -0.2010158 0.0297561 -6.755
room_typeShared room -0.4593022 0.1047868 -4.383
Pr(>|t|)
(Intercept) < 2e-16 ***
prop_type_simplifiedEntire villa 0.00137 **
prop_type_simplifiedFarm stay 2.16e-09 ***
prop_type_simplifiedOther 0.07343 .
prop_type_simplifiedPrivate room in farm stay < 2e-16 ***
number_of_reviews 0.16337
review_scores_rating 0.36741
room_typePrivate room 1.70e-11 ***
room_typeShared room 1.21e-05 ***
Residual standard error: 0.6282 on 3072 degrees of freedom
(3169 observations deleted due to missingness)
Multiple R-squared: 0.1287, Adjusted R-squared: 0.1264
F-statistic: 56.73 on 8 and 3072 DF, p-value: < 2.2e-16
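In model2, room_type is a significant predictor: the coefficients for both the "Private room" and "Shared room" categories have p-values well below 0.05. Relative to the baseline room type, private rooms are about 18% cheaper (exp(-0.201) ≈ 0.82) and shared rooms about 37% cheaper (exp(-0.459) ≈ 0.63), holding the other variables constant.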
bathrooms, bedrooms, beds, or size of the house (accomodates) significant predictors of price_4_nights? Or might these be co-linear variables?model_3 <- lm(log(price_4_nights)~bedrooms+beds+accommodates,data = listings)
msummary(model_3) Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.760921 0.016737 463.704 < 2e-16 ***
bedrooms 0.056822 0.008474 6.705 2.19e-11 ***
beds -0.026947 0.003826 -7.042 2.09e-12 ***
accommodates -0.018089 0.003684 -4.911 9.32e-07 ***
Residual standard error: 0.7622 on 6168 degrees of freedom
(78 observations deleted due to missingness)
Multiple R-squared: 0.01607, Adjusted R-squared: 0.01559
F-statistic: 33.57 on 3 and 6168 DF, p-value: < 2.2e-16
# VIFs for model_3. vif() is called directly: piping the model in while also naming it as an argument passed the model twice.
car::vif(model_3) bedrooms beds accommodates
4.744730 2.926654 3.776660
In the Beijing data set the "bathrooms" variable is empty. Therefore, the analysis is done with the "bedrooms", "beds" and "accommodates" variables. All of these variables are found to be significant.
When we investigate the VIFs, we see that none of them is higher than 5, although "bedrooms" comes close with a VIF of 4.7, so these could potentially be collinear variables. Therefore, we will add beds and accommodates but not bedrooms. These variables will be added at the very end so that they do not affect the collinearity checks in the other models.
(host_is_superhost) command a pricing premium, after controlling for other variables?model_4 <- lm(log(price_4_nights)~prop_type_simplified + number_of_reviews+review_scores_rating+ room_type+accommodates+host_is_superhost,data = listings)
msummary(model_4) Estimate Std. Error t value
(Intercept) 7.9396873 0.0683955 116.085
prop_type_simplifiedEntire villa 0.1747360 0.0454328 3.846
prop_type_simplifiedFarm stay -0.3087231 0.0489801 -6.303
prop_type_simplifiedOther 0.0225057 0.0401533 0.560
prop_type_simplifiedPrivate room in farm stay -0.6753030 0.0631445 -10.695
number_of_reviews -0.0018659 0.0007367 -2.533
review_scores_rating 0.0057109 0.0113171 0.505
room_typePrivate room -0.3008144 0.0313469 -9.596
room_typeShared room -0.5664130 0.1047953 -5.405
accommodates -0.0234069 0.0026911 -8.698
host_is_superhostTRUE 0.0844251 0.0243123 3.473
Pr(>|t|)
(Intercept) < 2e-16 ***
prop_type_simplifiedEntire villa 0.000122 ***
prop_type_simplifiedFarm stay 3.34e-10 ***
prop_type_simplifiedOther 0.575183
prop_type_simplifiedPrivate room in farm stay < 2e-16 ***
number_of_reviews 0.011364 *
review_scores_rating 0.613856
room_typePrivate room < 2e-16 ***
room_typeShared room 6.98e-08 ***
accommodates < 2e-16 ***
host_is_superhostTRUE 0.000523 ***
Residual standard error: 0.619 on 3070 degrees of freedom
(3169 observations deleted due to missingness)
Multiple R-squared: 0.1543, Adjusted R-squared: 0.1516
F-statistic: 56.03 on 10 and 3070 DF, p-value: < 2.2e-16
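Yes, it is safe to say that superhosts command a price premium: host_is_superhost is a significant variable in the model (p < 0.001) and its coefficient is positive (0.084, i.e. roughly 9% higher prices, holding the other variables constant).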
instant_bookable == TRUE), while a non-trivial proportion donāt. After controlling for other variables, is instant_bookable a significant predictor of price_4_nights?model_5 <- lm(log(price_4_nights)~prop_type_simplified + number_of_reviews+review_scores_rating+ room_type+accommodates+host_is_superhost+instant_bookable,data = listings)
msummary(model_5) Estimate Std. Error t value
(Intercept) 7.8873138 0.0693186 113.783
prop_type_simplifiedEntire villa 0.1725385 0.0453109 3.808
prop_type_simplifiedFarm stay -0.3091861 0.0488456 -6.330
prop_type_simplifiedOther 0.0237491 0.0400440 0.593
prop_type_simplifiedPrivate room in farm stay -0.6645221 0.0630224 -10.544
number_of_reviews -0.0018775 0.0007347 -2.556
review_scores_rating 0.0025309 0.0113109 0.224
room_typePrivate room -0.2997031 0.0312619 -9.587
room_typeShared room -0.5794905 0.1045527 -5.543
accommodates -0.0236088 0.0026841 -8.796
host_is_superhostTRUE 0.0733655 0.0243856 3.009
instant_bookableTRUE 0.1031757 0.0243532 4.237
Pr(>|t|)
(Intercept) < 2e-16 ***
prop_type_simplifiedEntire villa 0.000143 ***
prop_type_simplifiedFarm stay 2.81e-10 ***
prop_type_simplifiedOther 0.553175
prop_type_simplifiedPrivate room in farm stay < 2e-16 ***
number_of_reviews 0.010649 *
review_scores_rating 0.822963
room_typePrivate room < 2e-16 ***
room_typeShared room 3.23e-08 ***
accommodates < 2e-16 ***
host_is_superhostTRUE 0.002646 **
instant_bookableTRUE 2.34e-05 ***
Residual standard error: 0.6173 on 3069 degrees of freedom
(3169 observations deleted due to missingness)
Multiple R-squared: 0.1593, Adjusted R-squared: 0.1562
F-statistic: 52.85 on 11 and 3069 DF, p-value: < 2.2e-16
# Generalised VIFs for model_5. vif() is called directly: piping the model in while also naming it as an argument passed the model twice.
car::vif(model_5) GVIF Df GVIF^(1/(2*Df))
prop_type_simplified 1.748655 4 1.072354
number_of_reviews 1.062425 1 1.030740
review_scores_rating 1.024080 1 1.011969
room_type 1.903971 2 1.174668
accommodates 1.457615 1 1.207317
host_is_superhost 1.083429 1 1.040879
instant_bookable 1.024292 1 1.012073
neighbourhood, neighbourhood_cleansed, and neighbourhood_group_cleansed. There are typically more than 20 neighbourhoods in each city, and it wouldnāt make sense to include them all in your model. Use your city knowledge, or ask someone with city knowledge, and see whether you can group neighbourhoods together so the majority of listings falls in fewer (5-6 max) geographical areas. You would thus need to create a new categorical variabale neighbourhood_simplified and determine whether location is a predictor of price_4_nights#Reducing the categories for the neighbourhood
listings <- listings %>%
mutate(neighbourhood_simplified = case_when(
neighbourhood_cleansed %in% c("ęęåŗ / Huairou","å»¶åŗåæ / Yanqing", "åÆäŗåæ / Miyun") ~ neighbourhood_cleansed,
TRUE ~ "Other"
))
# Checking our results
listings %>%
count(neighbourhood_cleansed, neighbourhood_simplified) %>%
arrange(desc(n)) | neighbourhood_cleansed | neighbourhood_simplified | n |
|---|---|---|
| ęęåŗ / Huairou | ęęåŗ / Huairou | 1770 |
| å»¶åŗåæ / Yanqing | å»¶åŗåæ / Yanqing | 1550 |
| åÆäŗåæ / Miyun | åÆäŗåæ / Miyun | 976 |
| ęæå±±åŗ | Other | 489 |
| ęå¹³åŗ | Other | 279 |
| äøååŗ | Other | 278 |
| å¹³č°·åŗ / Pinggu | Other | 249 |
| éØå¤“ę²åŗ / Mentougou | Other | 140 |
| éå·åŗ / Tongzhou | Other | 136 |
| ęé³åŗ / Chaoyang | Other | 94 |
| 脿ååŗ | Other | 89 |
| 锺ä¹åŗ / Shunyi | Other | 82 |
| å¤§å “åŗ / Daxing | Other | 47 |
| ęµ·ę·åŗ | Other | 42 |
| äø°å°åŗ / Fengtai | Other | 27 |
| ē³ęÆå±±åŗ | Other | 2 |
# Model 6: the model_5 predictors plus the simplified neighbourhood grouping.
model_6 <- lm(
  log(price_4_nights) ~ prop_type_simplified + number_of_reviews +
    review_scores_rating + room_type + accommodates + host_is_superhost +
    instant_bookable + neighbourhood_simplified,
  data = listings
)
msummary(model_6) Estimate Std. Error t value
(Intercept) 7.8380388 0.0697560 112.364
prop_type_simplifiedEntire villa 0.2105253 0.0448471 4.694
prop_type_simplifiedFarm stay -0.3002228 0.0481599 -6.234
prop_type_simplifiedOther 0.0368615 0.0394976 0.933
prop_type_simplifiedPrivate room in farm stay -0.6404506 0.0627714 -10.203
number_of_reviews -0.0015691 0.0007292 -2.152
review_scores_rating 0.0016299 0.0111431 0.146
room_typePrivate room -0.3137658 0.0312073 -10.054
room_typeShared room -0.5437272 0.1037815 -5.239
accommodates -0.0265040 0.0026652 -9.945
host_is_superhostTRUE 0.0774135 0.0240314 3.221
instant_bookableTRUE 0.1045542 0.0241070 4.337
neighbourhood_simplifiedåÆäŗåæ / Miyun -0.0490435 0.0340834 -1.439
neighbourhood_simplifiedå»¶åŗåæ / Yanqing 0.0195919 0.0307229 0.638
neighbourhood_simplifiedęęåŗ / Huairou 0.2363871 0.0291593 8.107
Pr(>|t|)
(Intercept) < 2e-16 ***
prop_type_simplifiedEntire villa 2.79e-06 ***
prop_type_simplifiedFarm stay 5.17e-10 ***
prop_type_simplifiedOther 0.35076
prop_type_simplifiedPrivate room in farm stay < 2e-16 ***
number_of_reviews 0.03149 *
review_scores_rating 0.88372
room_typePrivate room < 2e-16 ***
room_typeShared room 1.72e-07 ***
accommodates < 2e-16 ***
host_is_superhostTRUE 0.00129 **
instant_bookableTRUE 1.49e-05 ***
neighbourhood_simplifiedåÆäŗåæ / Miyun 0.15027
neighbourhood_simplifiedå»¶åŗåæ / Yanqing 0.52372
neighbourhood_simplifiedęęåŗ / Huairou 7.44e-16 ***
Residual standard error: 0.6081 on 3066 degrees of freedom
(3169 observations deleted due to missingness)
Multiple R-squared: 0.1849, Adjusted R-squared: 0.1812
F-statistic: 49.69 on 14 and 3066 DF, p-value: < 2.2e-16
# Generalised VIFs for model_6. vif() is called directly: piping the model in while also naming it as an argument passed the model twice.
car::vif(model_6) GVIF Df GVIF^(1/(2*Df))
prop_type_simplified 1.813047 4 1.077212
number_of_reviews 1.078582 1 1.038548
review_scores_rating 1.024223 1 1.012039
room_type 1.974884 2 1.185456
accommodates 1.480890 1 1.216918
host_is_superhost 1.084273 1 1.041284
instant_bookable 1.034293 1 1.017002
neighbourhood_simplified 1.139028 3 1.021933
The districts are divided into four groups according to the knowledge of the group members: Huairou, Yanqing, Miyun and Other.
Only the Huairou district is significant, and it commands a price premium of roughly 27% relative to the "Other" group (exp(0.236) ≈ 1.27). This makes sense since Huairou is a high quality district and quite popular on social media.
avalability_30 or reviews_per_month on price_4_nights, after we control for other variables?model_7 <- lm(log(price_4_nights)~prop_type_simplified + number_of_reviews+review_scores_rating+ room_type+accommodates+host_is_superhost+instant_bookable+neighbourhood_simplified+availability_30+reviews_per_month,data = listings)
msummary(model_7) Estimate Std. Error t value
(Intercept) 7.7513655 0.0731219 106.006
prop_type_simplifiedEntire villa 0.2041332 0.0446815 4.569
prop_type_simplifiedFarm stay -0.2920707 0.0479909 -6.086
prop_type_simplifiedOther 0.0381457 0.0393449 0.970
prop_type_simplifiedPrivate room in farm stay -0.6278441 0.0625751 -10.033
number_of_reviews -0.0034081 0.0008401 -4.057
review_scores_rating 0.0008305 0.0111014 0.075
room_typePrivate room -0.3118790 0.0310819 -10.034
room_typeShared room -0.5505730 0.1038216 -5.303
accommodates -0.0268431 0.0026568 -10.104
host_is_superhostTRUE 0.0740687 0.0239481 3.093
instant_bookableTRUE 0.0847914 0.0243592 3.481
neighbourhood_simplifiedåÆäŗåæ / Miyun -0.0396342 0.0342567 -1.157
neighbourhood_simplifiedå»¶åŗåæ / Yanqing 0.0320826 0.0307525 1.043
neighbourhood_simplifiedęęåŗ / Huairou 0.2508672 0.0292427 8.579
availability_30 0.0036026 0.0012986 2.774
reviews_per_month 0.0686400 0.0156278 4.392
Pr(>|t|)
(Intercept) < 2e-16 ***
prop_type_simplifiedEntire villa 5.10e-06 ***
prop_type_simplifiedFarm stay 1.30e-09 ***
prop_type_simplifiedOther 0.332362
prop_type_simplifiedPrivate room in farm stay < 2e-16 ***
number_of_reviews 5.10e-05 ***
review_scores_rating 0.940372
room_typePrivate room < 2e-16 ***
room_typeShared room 1.22e-07 ***
accommodates < 2e-16 ***
host_is_superhostTRUE 0.002000 **
instant_bookableTRUE 0.000507 ***
neighbourhood_simplifiedåÆäŗåæ / Miyun 0.247373
neighbourhood_simplifiedå»¶åŗåæ / Yanqing 0.296913
neighbourhood_simplifiedęęåŗ / Huairou < 2e-16 ***
availability_30 0.005569 **
reviews_per_month 1.16e-05 ***
Residual standard error: 0.6057 on 3064 degrees of freedom
(3169 observations deleted due to missingness)
Multiple R-squared: 0.1921, Adjusted R-squared: 0.1879
F-statistic: 45.54 on 16 and 3064 DF, p-value: < 2.2e-16
# Generalised VIFs for model_7. vif() is called directly: piping the model in while also naming it as an argument passed the model twice.
car::vif(model_7) GVIF Df GVIF^(1/(2*Df))
prop_type_simplified 1.828539 4 1.078358
number_of_reviews 1.443470 1 1.201445
review_scores_rating 1.024933 1 1.012390
room_type 1.994068 2 1.188324
accommodates 1.483677 1 1.218063
host_is_superhost 1.085628 1 1.041935
instant_bookable 1.064740 1 1.031862
neighbourhood_simplified 1.169792 3 1.026482
availability_30 1.049162 1 1.024286
reviews_per_month 1.443963 1 1.201650
As you keep building your models, it makes sense to:
# Final model: review_scores_rating is dropped because it was not a
# significant variable; beds is added and does not create a significant
# collinearity problem.
model_final <- lm(
  log(price_4_nights) ~ prop_type_simplified + number_of_reviews +
    room_type + accommodates + host_is_superhost + instant_bookable +
    neighbourhood_simplified + availability_30 + reviews_per_month + beds,
  data = listings
)
msummary(model_final) Estimate Std. Error t value
(Intercept) 7.7473387 0.0538902 143.762
prop_type_simplifiedEntire villa 0.2075827 0.0446161 4.653
prop_type_simplifiedFarm stay -0.2923000 0.0479006 -6.102
prop_type_simplifiedOther 0.0371538 0.0392765 0.946
prop_type_simplifiedPrivate room in farm stay -0.6190147 0.0625247 -9.900
number_of_reviews -0.0033187 0.0008387 -3.957
room_typePrivate room -0.3082612 0.0311559 -9.894
room_typeShared room -0.5211982 0.1045836 -4.984
accommodates -0.0208866 0.0038007 -5.495
host_is_superhostTRUE 0.0745753 0.0237817 3.136
instant_bookableTRUE 0.0865071 0.0242825 3.563
neighbourhood_simplifiedåÆäŗåæ / Miyun -0.0403816 0.0341986 -1.181
neighbourhood_simplifiedå»¶åŗåæ / Yanqing 0.0311728 0.0307198 1.015
neighbourhood_simplifiedęęåŗ / Huairou 0.2470570 0.0292239 8.454
availability_30 0.0036350 0.0012978 2.801
reviews_per_month 0.0670759 0.0156150 4.296
beds -0.0090521 0.0042276 -2.141
Pr(>|t|)
(Intercept) < 2e-16 ***
prop_type_simplifiedEntire villa 3.42e-06 ***
prop_type_simplifiedFarm stay 1.18e-09 ***
prop_type_simplifiedOther 0.344247
prop_type_simplifiedPrivate room in farm stay < 2e-16 ***
number_of_reviews 7.76e-05 ***
room_typePrivate room < 2e-16 ***
room_typeShared room 6.59e-07 ***
accommodates 4.22e-08 ***
host_is_superhostTRUE 0.001730 **
instant_bookableTRUE 0.000373 ***
neighbourhood_simplifiedåÆäŗåæ / Miyun 0.237775
neighbourhood_simplifiedå»¶åŗåæ / Yanqing 0.310308
neighbourhood_simplifiedęęåŗ / Huairou < 2e-16 ***
availability_30 0.005130 **
reviews_per_month 1.80e-05 ***
beds 0.032335 *
Residual standard error: 0.6046 on 3061 degrees of freedom
(3172 observations deleted due to missingness)
Multiple R-squared: 0.1939, Adjusted R-squared: 0.1897
F-statistic: 46.01 on 16 and 3061 DF, p-value: < 2.2e-16
# Generalised VIFs for model_final. vif() is called directly: piping the model in while also naming it as an argument passed the model twice.
car::vif(model_final) GVIF Df GVIF^(1/(2*Df))
prop_type_simplified 1.840573 4 1.079243
number_of_reviews 1.443587 1 1.201494
room_type 2.036250 2 1.194559
accommodates 3.043999 1 1.744706
host_is_superhost 1.073473 1 1.036085
instant_bookable 1.060213 1 1.029667
neighbourhood_simplified 1.171404 3 1.026718
availability_30 1.049500 1 1.024451
reviews_per_month 1.446779 1 1.202821
beds 2.444336 1 1.563437
autoplot(model_x)#Final Model
# Generalised VIFs for model_final. vif() is called directly: piping the model in while also naming it as an argument passed the model twice.
car::vif(model_final) GVIF Df GVIF^(1/(2*Df))
prop_type_simplified 1.840573 4 1.079243
number_of_reviews 1.443587 1 1.201494
room_type 2.036250 2 1.194559
accommodates 3.043999 1 1.744706
host_is_superhost 1.073473 1 1.036085
instant_bookable 1.060213 1 1.029667
neighbourhood_simplified 1.171404 3 1.026718
availability_30 1.049500 1 1.024451
reviews_per_month 1.446779 1 1.202821
beds 2.444336 1 1.563437
autoplot(model_final)
The Residuals vs Fitted plot doesn't seem to follow a specific pattern, which suggests that the linearity assumption of the regression holds. The Normal Q-Q plot, while not perfect, is reasonably consistent with the normality assumption for the residuals.
# Generalised VIFs for model_final. vif() is called directly: piping the model in while also naming it as an argument passed the model twice.
car::vif(model_final) GVIF Df GVIF^(1/(2*Df))
prop_type_simplified 1.840573 4 1.079243
number_of_reviews 1.443587 1 1.201494
room_type 2.036250 2 1.194559
accommodates 3.043999 1 1.744706
host_is_superhost 1.073473 1 1.036085
instant_bookable 1.060213 1 1.029667
neighbourhood_simplified 1.171404 3 1.026718
availability_30 1.049500 1 1.024451
reviews_per_month 1.446779 1 1.202821
beds 2.444336 1 1.563437
Collinearity is checked at every stage of explanatory variable addition and no significant collinearity problem exists in the data set since all VIF values are significantly below 5.
huxtable:library(huxtable)
huxreg(model1, model2, model_4,model_5,model_6,model_7,model_final)| (1) | (2) | (3) | (4) | (5) | (6) | (7) | |
|---|---|---|---|---|---|---|---|
| (Intercept) | 7.692 *** | 7.701 *** | 7.940 *** | 7.887 *** | 7.838 *** | 7.751 *** | 7.747 *** |
| (0.064) | (0.064) | (0.068) | (0.069) | (0.070) | (0.073) | (0.054) | |
| prop_type_simplifiedEntire villa | 0.147 ** | 0.147 ** | 0.175 *** | 0.173 *** | 0.211 *** | 0.204 *** | 0.208 *** |
| (0.046) | (0.046) | (0.045) | (0.045) | (0.045) | (0.045) | (0.045) | |
| prop_type_simplifiedFarm stay | -0.298 *** | -0.298 *** | -0.309 *** | -0.309 *** | -0.300 *** | -0.292 *** | -0.292 *** |
| (0.050) | (0.050) | (0.049) | (0.049) | (0.048) | (0.048) | (0.048) | |
| prop_type_simplifiedOther | -0.042 | 0.072 | 0.023 | 0.024 | 0.037 | 0.038 | 0.037 |
| (0.037) | (0.040) | (0.040) | (0.040) | (0.039) | (0.039) | (0.039) | |
| prop_type_simplifiedPrivate room in farm stay | -0.858 *** | -0.656 *** | -0.675 *** | -0.665 *** | -0.640 *** | -0.628 *** | -0.619 *** |
| (0.057) | (0.064) | (0.063) | (0.063) | (0.063) | (0.063) | (0.063) | |
| number_of_reviews | -0.001 | -0.001 | -0.002 * | -0.002 * | -0.002 * | -0.003 *** | -0.003 *** |
| (0.001) | (0.001) | (0.001) | (0.001) | (0.001) | (0.001) | (0.001) | |
| review_scores_rating | 0.013 | 0.010 | 0.006 | 0.003 | 0.002 | 0.001 | |
| (0.012) | (0.011) | (0.011) | (0.011) | (0.011) | (0.011) | ||
| room_typePrivate room | -0.201 *** | -0.301 *** | -0.300 *** | -0.314 *** | -0.312 *** | -0.308 *** | |
| (0.030) | (0.031) | (0.031) | (0.031) | (0.031) | (0.031) | ||
| room_typeShared room | -0.459 *** | -0.566 *** | -0.579 *** | -0.544 *** | -0.551 *** | -0.521 *** | |
| (0.105) | (0.105) | (0.105) | (0.104) | (0.104) | (0.105) | ||
| accommodates | -0.023 *** | -0.024 *** | -0.027 *** | -0.027 *** | -0.021 *** | ||
| (0.003) | (0.003) | (0.003) | (0.003) | (0.004) | |||
| host_is_superhostTRUE | 0.084 *** | 0.073 ** | 0.077 ** | 0.074 ** | 0.075 ** | ||
| (0.024) | (0.024) | (0.024) | (0.024) | (0.024) | |||
| instant_bookableTRUE | 0.103 *** | 0.105 *** | 0.085 *** | 0.087 *** | |||
| (0.024) | (0.024) | (0.024) | (0.024) | ||||
| neighbourhood_simplifiedåÆäŗåæ / Miyun | -0.049 | -0.040 | -0.040 | ||||
| (0.034) | (0.034) | (0.034) | |||||
| neighbourhood_simplifiedå»¶åŗåæ / Yanqing | 0.020 | 0.032 | 0.031 | ||||
| (0.031) | (0.031) | (0.031) | |||||
| neighbourhood_simplifiedęęåŗ / Huairou | 0.236 *** | 0.251 *** | 0.247 *** | ||||
| (0.029) | (0.029) | (0.029) | |||||
| availability_30 | 0.004 ** | 0.004 ** | |||||
| (0.001) | (0.001) | ||||||
| reviews_per_month | 0.069 *** | 0.067 *** | |||||
| (0.016) | (0.016) | ||||||
| beds | -0.009 * | ||||||
| (0.004) | |||||||
| N | 3081 | 3081 | 3081 | 3081 | 3081 | 3081 | 3078 |
| R2 | 0.112 | 0.129 | 0.154 | 0.159 | 0.185 | 0.192 | 0.194 |
| logLik | -2963.099 | -2934.665 | -2888.684 | -2879.701 | -2831.922 | -2818.295 | -2809.926 |
| AIC | 5942.197 | 5889.330 | 5801.368 | 5785.402 | 5695.844 | 5672.589 | 5655.853 |
| *** p < 0.001; ** p < 0.01; * p < 0.05. | |||||||
# Keep private rooms with at least 10 reviews and a rating of 4.5 or higher.
# The filtered data frame has 161 observations.
filtered_listings <- filter(
  listings,
  room_type == "Private room",
  number_of_reviews >= 10,
  review_scores_rating >= 4.5
)

# Predict log(price_4_nights) with prediction intervals, then back-transform
# fit, lwr and upr to the price scale with exp().
predicted <- predict(
  model_final,
  newdata = filtered_listings,
  interval = "prediction"
) %>%
  exp() %>%
  data.frame()
predicted %>%
summarise(avg_price_4_nights = mean(fit),upper=mean(upr),lower=mean(lwr)) #average of fit, upr and lower| avg_price_4_nights | upper | lower |
|---|---|---|
| 1.95e+03 | 6.41e+03 | 592 |
# Give both tables a matching row index so each prediction can be joined back
# to the listing it was computed for.
# seq_len() replaces seq.int(): for an empty table seq.int(0) yields c(1, 0),
# which cannot be assigned as a column, whereas seq_len(0) is empty.
predicted$ID <- seq_len(nrow(predicted))
filtered_listings$ID <- seq_len(nrow(filtered_listings))

# Attach fit/lwr/upr to the listings and sort from highest to lowest fit.
filtered_listings_ordered <- filtered_listings %>%
  left_join(predicted, by = "ID") %>%
  arrange(desc(fit))

# Re-number after sorting so that ID is the rank of each listing by fit.
filtered_listings_ordered$ID <- seq_len(nrow(filtered_listings_ordered))
# Line graph to observe how well the prices are predicted. Rows whose upper
# prediction bound is 10000 or more are dropped since two values were outliers.
# The legend label previously read "Actal Price"; the typo is corrected here.
filtered_listings_ordered %>%
  filter(upr < 10000) %>%
  ggplot(aes(x = ID)) +
  geom_line(aes(y = fit, col = "Fitted Line")) +
  geom_line(aes(y = price_4_nights, col = "Actual Price")) +
  geom_line(aes(y = upr)) +
  geom_line(aes(y = lwr)) +
  xlab("Observations") +
  ylab("Price for 4 Nights") +
theme_minimal()Utku Odabasi ChloƩ Baubier Jiacheng Zhu Jay Bensal Mengtian Li Yaxin Liu